This notebook contains the exploratory analysis of the BirdCLEF 2021 dataset: first the training metadata, then the audio files themselves.
import os
import json
import librosa
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from glob import glob
import pickle
import sox
from tqdm import tqdm
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
Link for dataset : https://www.kaggle.com/competitions/birdclef-2021
# Load the training metadata into a DataFrame and preview the first rows
data = pd.read_csv('birdclef-2021/train_metadata.csv')
data.head()
| primary_label | secondary_labels | type | latitude | longitude | scientific_name | common_name | author | date | filename | license | rating | time | url | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | acafly | ['amegfi'] | ['begging call', 'call', 'juvenile'] | 35.3860 | -84.1250 | Empidonax virescens | Acadian Flycatcher | Mike Nelson | 2012-08-12 | XC109605.ogg | Creative Commons Attribution-NonCommercial-Sha... | 2.5 | 09:30 | https://www.xeno-canto.org/109605 |
| 1 | acafly | [] | ['call'] | 9.1334 | -79.6501 | Empidonax virescens | Acadian Flycatcher | Allen T. Chartier | 2000-12-26 | XC11209.ogg | Creative Commons Attribution-NonCommercial-Sha... | 3.0 | ? | https://www.xeno-canto.org/11209 |
| 2 | acafly | [] | ['call'] | 5.7813 | -75.7452 | Empidonax virescens | Acadian Flycatcher | Sergio Chaparro-Herrera | 2012-01-10 | XC127032.ogg | Creative Commons Attribution-NonCommercial-Sha... | 3.0 | 15:20 | https://www.xeno-canto.org/127032 |
| 3 | acafly | ['whwbec1'] | ['call'] | 4.6717 | -75.6283 | Empidonax virescens | Acadian Flycatcher | Oscar Humberto Marin-Gomez | 2009-06-19 | XC129974.ogg | Creative Commons Attribution-NonCommercial-Sha... | 3.5 | 07:50 | https://www.xeno-canto.org/129974 |
| 4 | acafly | ['whwbec1'] | ['call'] | 4.6717 | -75.6283 | Empidonax virescens | Acadian Flycatcher | Oscar Humberto Marin-Gomez | 2009-06-19 | XC129981.ogg | Creative Commons Attribution-NonCommercial-Sha... | 3.5 | 07:50 | https://www.xeno-canto.org/129981 |
# Summarise the dataset: column names, non-null counts, dtypes and memory usage
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 62874 entries, 0 to 62873 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 primary_label 62874 non-null object 1 secondary_labels 62874 non-null object 2 type 62874 non-null object 3 latitude 62874 non-null float64 4 longitude 62874 non-null float64 5 scientific_name 62874 non-null object 6 common_name 62874 non-null object 7 author 62874 non-null object 8 date 62874 non-null object 9 filename 62874 non-null object 10 license 62874 non-null object 11 rating 62874 non-null float64 12 time 62874 non-null object 13 url 62874 non-null object dtypes: float64(3), object(11) memory usage: 6.7+ MB
No columns have missing values.
# Total number of species in the dataset (distinct values of primary_label)
species_count = data['primary_label'].nunique()
print("Total number of species :", species_count)
Total number of species : 397
# Geographical distribution of the data: one marker per recording, placed at
# the longitude/latitude stored in the metadata.
fig = go.Figure(data=go.Scattergeo(
    lon=data['longitude'],
    lat=data['latitude'],
))
fig.update_layout(
    # Title fixed: the original read "Distribution recordings of of bird Sound"
    title='Distribution of bird sound recordings',
    geo_scope='world',
)
fig.show()
# Number of recordings for each species
species = data['primary_label'].value_counts()

# One bar per species, drawn with tight margins around the plot area
bar_trace = go.Bar(x=species.index, y=species.values)
tight_margin = go.layout.Margin(l=0, r=0, b=10, t=50)
fig = go.Figure(data=[bar_trace], layout=go.Layout(margin=tight_margin))

# Title the chart and render it
fig.update_layout(title='Count of records per species(class)')
fig.show()
Rating
The rating is the overall quality score that users assign to a recording. It ranges from 0.5 to 5.0, with 5.0 being the best possible rating.
# Histogram of the quality ratings, one count per recording
hist_data = data['rating'].values.tolist()
rating_trace = go.Histogram(x=hist_data)
fig = go.Figure(
    data=[rating_trace],
    layout=go.Layout(margin=go.layout.Margin(l=0, r=0, b=10, t=50)),
)
fig.update_layout(title='Number of recordings per rating')
fig.show()
# Keep only the recordings rated 3.5 or higher (rows with a rating below 3.5 are dropped)
data = data.query('rating>=3.5')
data
| primary_label | secondary_labels | type | latitude | longitude | scientific_name | common_name | author | date | filename | license | rating | time | url | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | acafly | ['whwbec1'] | ['call'] | 4.6717 | -75.6283 | Empidonax virescens | Acadian Flycatcher | Oscar Humberto Marin-Gomez | 2009-06-19 | XC129974.ogg | Creative Commons Attribution-NonCommercial-Sha... | 3.5 | 07:50 | https://www.xeno-canto.org/129974 |
| 4 | acafly | ['whwbec1'] | ['call'] | 4.6717 | -75.6283 | Empidonax virescens | Acadian Flycatcher | Oscar Humberto Marin-Gomez | 2009-06-19 | XC129981.ogg | Creative Commons Attribution-NonCommercial-Sha... | 3.5 | 07:50 | https://www.xeno-canto.org/129981 |
| 5 | acafly | [] | ['call'] | 4.6717 | -75.6283 | Empidonax virescens | Acadian Flycatcher | Oscar Humberto Marin-Gomez | 2007-11-05 | XC130056.ogg | Creative Commons Attribution-NonCommercial-Sha... | 4.0 | 07:45 | https://www.xeno-canto.org/130056 |
| 6 | acafly | [] | ['call'] | 4.6717 | -75.6283 | Empidonax virescens | Acadian Flycatcher | Oscar Humberto Marin-Gomez | 2007-11-12 | XC130133.ogg | Creative Commons Attribution-NonCommercial-Sha... | 4.0 | 07:50 | https://www.xeno-canto.org/130133 |
| 7 | acafly | [] | ['call'] | 4.6717 | -75.6283 | Empidonax virescens | Acadian Flycatcher | Oscar Humberto Marin-Gomez | 2007-11-12 | XC130140.ogg | Creative Commons Attribution-NonCommercial-Sha... | 4.0 | 14:00 | https://www.xeno-canto.org/130140 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 62868 | yetvir | ['eawpew', 'reevir1', 'amered', 'acafly'] | ['adult', 'male', 'song'] | 43.2167 | -81.9021 | Vireo flavifrons | Yellow-throated Vireo | Scott Connop | 2020-06-09 | XC583859.ogg | Creative Commons Attribution-NonCommercial-Sha... | 4.5 | 12:25 | https://www.xeno-canto.org/583859 |
| 62870 | yetvir | [] | ['life stage uncertain', 'sex uncertain', 'song'] | 42.3005 | -72.5877 | Vireo flavifrons | Yellow-throated Vireo | Christopher McPherson | 2019-05-31 | XC600085.ogg | Creative Commons Attribution-NonCommercial-Sha... | 5.0 | 09:30 | https://www.xeno-canto.org/600085 |
| 62871 | yetvir | ['amered', 'eawpew', 'norcar', 'reevir1'] | ['adult', 'male', 'song'] | 42.3005 | -72.5877 | Vireo flavifrons | Yellow-throated Vireo | Christopher McPherson | 2020-06-02 | XC602701.ogg | Creative Commons Attribution-NonCommercial-Sha... | 4.5 | 08:30 | https://www.xeno-canto.org/602701 |
| 62872 | yetvir | [] | ['uncertain'] | 32.2357 | -99.8811 | Vireo flavifrons | Yellow-throated Vireo | Brad Banner | 2019-04-27 | XC614733.ogg | Creative Commons Attribution-NonCommercial-Sha... | 4.0 | 17:30 | https://www.xeno-canto.org/614733 |
| 62873 | yetvir | ['gamqua', 'whwdov'] | ['adult', 'male', 'song'] | 31.9060 | -109.1543 | Vireo flavifrons | Yellow-throated Vireo | Richard E. Webster | 2020-05-26 | XC615888.ogg | Creative Commons Attribution-NonCommercial-Sha... | 4.5 | 06:23 | https://www.xeno-canto.org/615888 |
48886 rows × 14 columns
# Number of remaining recordings per species after the rating filter
data['primary_label'].value_counts()
eursta 500
houspa 500
redcro 500
sonspa 500
gbwwre1 500
...
runwre1 9
whcpar 8
crfpar 5
wegspa1 5
stvhum2 4
Name: primary_label, Length: 397, dtype: int64
# Fraction of species that have fewer than 100 recordings.
# The comparison is strict (<) so it agrees with the filter applied further
# down, which keeps every species with at least 100 recordings; the previous
# `<= 100` also counted species with exactly 100 recordings as rare.
sum(data['primary_label'].value_counts() < 100) / data['primary_label'].nunique()
0.5062972292191436
Drop the species that have fewer than 100 recordings.
# Species having at least 100 recordings.
# The per-label counts are taken straight from the groupby result, so every
# label stays paired with its own count. The previous version zipped
# unique() (order of first appearance) with the groupby counts (sorted by
# label), which only lines up when the frame happens to be sorted by label.
birds_count = data.groupby('primary_label')['primary_label'].count().to_dict()
most_represented_birds = [
    bird_species for bird_species, count in birds_count.items() if count >= 100
]
len(most_represented_birds)
200
# Keep only the recordings whose species is in most_represented_birds, then re-inspect the frame
data = data.loc[data.primary_label.isin(most_represented_birds)]
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 36362 entries, 3 to 62714 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 primary_label 36362 non-null object 1 secondary_labels 36362 non-null object 2 type 36362 non-null object 3 latitude 36362 non-null float64 4 longitude 36362 non-null float64 5 scientific_name 36362 non-null object 6 common_name 36362 non-null object 7 author 36362 non-null object 8 date 36362 non-null object 9 filename 36362 non-null object 10 license 36362 non-null object 11 rating 36362 non-null float64 12 time 36362 non-null object 13 url 36362 non-null object dtypes: float64(3), object(11) memory usage: 4.2+ MB
# Build the path of every audio file as <base_dir>/<primary_label>/<filename>.
# assign() returns a new DataFrame, so the column is no longer written into a
# filtered slice of the metadata (which raises SettingWithCopyWarning, hidden
# here by the global warnings filter).
base_dir = 'birdclef-2021/train_short_audio'
data = data.assign(
    full_path=base_dir + '/' + data['primary_label'] + '/' + data['filename']
)
# Get a random sample from sonspa bird folder (random_state fixed so the same
# file is picked on every run)
sonspa = data[data['primary_label'] == "sonspa"].sample(1, random_state = 1)['full_path'].values[0]
sonspa
'birdclef-2021/train_short_audio/sonspa/XC445424.ogg'
# Play the sampled audio file inline with IPython's audio widget
ipd.Audio(sonspa)
librosa is a Python package for music and audio analysis. It provides the building blocks needed to create music information retrieval systems.
# Load the audio as a waveform `y` and store the sampling rate as `sr`:
#   y  - the raw audio samples, a long 1-D numpy array of floats
#   sr - an integer giving the sampling rate (samples per second)
# NOTE(review): no `sr` argument is passed, so librosa's default target rate is
# used — presumably why 22050 is printed below; confirm against the librosa.load docs.
y, sr = librosa.load(sonspa)
print(f'y: {y}')
print(f'shape y: {y.shape}')
print(f'sr: {sr}')
y: [2.7304613e-05 7.4803167e-05 1.4458728e-04 ... 4.3406340e-04 9.3392772e-04 0.0000000e+00] shape y: (630035,) sr: 22050
# Plot the waveform of the loaded recording against time
librosa.display.waveshow(y, sr = sr)
<librosa.display.AdaptiveWaveplot at 0x169d811f0>
STFT breaks up the audio signal into smaller sections by using a sliding time window. It takes the FFT on each section and then combines them.
# Define frame size and hop size for STFT (both passed to librosa in samples)
FRAME_SIZE = 256
HOP_SIZE = 128
# Short-time Fourier transform of the waveform; the result is a 2-D array of complex values
S_y = librosa.stft(y, n_fft=FRAME_SIZE, hop_length=HOP_SIZE)
print(S_y.shape)
print(type(S_y[0][0]))
(129, 4923) <class 'numpy.complex64'>
A spectrogram breaks a sound signal into short segments of time and applies the Fourier Transform to each segment to find the frequencies it contains. The spectra of all segments are then placed side by side along the time axis to form a single plot.
# Power spectrogram: squared magnitude of every complex STFT coefficient
Y = np.abs(S_y) ** 2
print(Y.shape)
print(type(Y[0][0]))
(129, 4923) <class 'numpy.float32'>
def plot_spectrogram(Y, sr, hop_length, y_axis="linear"):
    """Draw the spectrogram matrix ``Y`` with librosa's ``specshow``.

    ``sr`` and ``hop_length`` are forwarded so the time axis can be laid out,
    and ``y_axis`` selects the scale used for the frequency axis
    ("linear" by default). The plot always uses the viridis colour map.
    """
    display_options = {
        "sr": sr,
        "hop_length": hop_length,
        "x_axis": "time",
        "y_axis": y_axis,
        "cmap": plt.get_cmap('viridis'),
    }
    librosa.display.specshow(Y, **display_options)
Source : https://youtube.com/playlist?list=PL-wATfeyAMNqIee7cH3q1bh4QJFAaeNv0
# Power spectrogram as computed, on a linear frequency axis
plot_spectrogram(Y, sr, HOP_SIZE)
# Convert power to decibels and plot again
Y_log = librosa.power_to_db(Y)
plot_spectrogram(Y_log, sr, HOP_SIZE)
# Same decibel spectrogram, this time with a logarithmic frequency axis
plot_spectrogram(Y_log, sr, HOP_SIZE, y_axis="log")
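A Mel Spectrogram differs from a regular Spectrogram (frequency vs. time) in two ways: it uses the Mel scale instead of linear frequency on the y-axis, and it uses the decibel scale instead of raw amplitude for the colors.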
# Mel spectrogram of the full recording, using librosa's default framing
S = librosa.feature.melspectrogram(y=y, sr=sr)
S_db_mel = librosa.power_to_db(S, ref=np.max)
# melspectrogram() is called without hop_length, so its frames are spaced by
# librosa's default of 512 samples. Plotting with HOP_SIZE (128) mislabelled
# the time axis by a factor of four, so the matching hop length is passed here.
# The rows are mel bands, hence y_axis="mel" rather than "log".
plot_spectrogram(S_db_mel, sr, 512, y_axis="mel")
def resize(audio, duration):
    """Pad or truncate an audio signal to ``duration`` seconds.

    Parameters
    ----------
    audio : tuple
        ``(sig, rate)`` pair: the sample array and its sampling rate.
    duration : int or float
        Target length in seconds. The resulting sample count,
        ``int(duration * rate)``, must not be negative.

    Returns
    -------
    tuple
        ``(sig, rate)`` where ``sig`` holds ``int(duration * rate)`` samples.
        A shorter signal is zero-padded at the end, a longer one is cut off.

    Raises
    ------
    ValueError
        If the requested sample count is negative.
    """
    sig, rate = audio
    num_sample = int(duration * rate)
    if num_sample < 0:
        # A negative count would make the slice below drop samples from the
        # tail instead of producing a signal of the requested length.
        raise ValueError(
            f"requested length must be non-negative, got duration={duration!r}"
        )
    if len(sig) < num_sample:
        # Too short: copy into a zero buffer of the target length
        sig_padded = np.zeros(num_sample, dtype=sig.dtype)
        sig_padded[:len(sig)] = sig
        return (sig_padded, rate)
    # Long enough: keep only the leading samples
    return (sig[:num_sample], rate)
# Bring the clip to a fixed length of 10 seconds and plot the resulting waveform
resized_sonspa, rate = resize((y, sr), duration = 10)
librosa.display.waveshow(resized_sonspa, sr = sr)
<librosa.display.AdaptiveWaveplot at 0x169dcaa00>
# Framing used for the fixed-length clip.
# NOTE(review): HOP_SIZE is larger than FRAME_SIZE, so consecutive analysis
# windows do not overlap — confirm this is intended.
FRAME_SIZE = 1024
HOP_SIZE = 1736
# Use FRAME_SIZE and the sampling rate returned by resize() instead of
# repeating the literals 1024 and 22050, so the spectrogram and its plot stay
# consistent with the audio that was actually loaded.
S = librosa.feature.melspectrogram(y=resized_sonspa,
                                   sr=rate,
                                   hop_length=HOP_SIZE,
                                   n_fft=FRAME_SIZE,
                                   n_mels=48)
S_db_mel = librosa.power_to_db(S, ref=np.max)
# The rows are mel bands, so label the frequency axis on the mel scale
# (the default "linear" axis treats them as evenly spaced frequencies).
plot_spectrogram(S_db_mel, rate, HOP_SIZE, y_axis="mel")
def augment_audio(audio, shift_limit = 0.5):
    """Circularly shift a signal by a random number of samples.

    ``audio`` is a ``(signal, rate)`` pair. A random fraction between 0 and
    ``shift_limit`` of the signal length is drawn, and the signal is rolled by
    that many samples so that samples pushed past the end wrap around to the
    start. Returns the shifted signal with the unchanged rate.
    """
    samples, sampling_rate = audio
    fraction = np.random.random() * shift_limit
    offset = int(fraction * len(samples))
    return (np.roll(samples, offset), sampling_rate)
# Apply the random time shift to the 10-second clip and plot the shifted waveform
augmented_sonspa, rate = augment_audio((resized_sonspa, rate))
librosa.display.waveshow(augmented_sonspa, sr = rate)
<librosa.display.AdaptiveWaveplot at 0x16a13f8e0>
# NOTE(review): despite the "acafly" names, these variables are built from (y, sr),
# i.e. the sonspa recording loaded earlier in this file — confirm whether an acafly clip was intended.
resized_acafly, rate = resize((y, sr), duration = 10)
augmented_acafly, rate = augment_audio((resized_acafly, rate))
# Mel spectrogram of the time-shifted clip, with the same settings as the un-augmented one above
S = librosa.feature.melspectrogram(y=augmented_acafly,
sr=22050,
hop_length = HOP_SIZE,
n_fft = 1024,
n_mels = 48)
# Convert to decibels relative to the maximum and plot
S_db_aug_mel = librosa.power_to_db(S, ref=np.max)
plot_spectrogram(S_db_aug_mel, sr, HOP_SIZE)
def time_mask(spec, T=10, num_masks=1, replace_with_zero=False):
    """Mask random blocks of consecutive columns (time frames) of a spectrogram.

    Parameters
    ----------
    spec : numpy.ndarray
        2-D spectrogram; masking is applied along axis 1. Not modified.
    T : int or float
        Exclusive upper bound on the width of a single mask, in columns.
    num_masks : int
        Number of masks to apply.
    replace_with_zero : bool
        Fill masked columns with 0 when true; otherwise fill them with the
        mean of the array as it stands when the mask is applied.

    Returns
    -------
    numpy.ndarray
        A masked copy of ``spec``.
    """
    cloned = spec.copy()
    n_frames = cloned.shape[1]
    for _ in range(num_masks):
        # Clamp the width to the number of columns: when the drawn width was
        # at least that large, randint() was given high <= 0 and raised
        # ValueError. A mask that wide now simply covers the whole axis.
        width = min(int(np.random.uniform(low=0.0, high=T)), n_frames)
        start = np.random.randint(low=0, high=max(n_frames - width, 1))
        fill = 0 if replace_with_zero else np.mean(cloned)
        cloned[:, start:start + width] = fill
    return cloned
def freq_mask(spec, F=20, num_masks=1, replace_with_zero=False):
    """Mask random blocks of consecutive rows (frequency bins) of a spectrogram.

    Parameters
    ----------
    spec : numpy.ndarray
        2-D spectrogram; masking is applied along axis 0. Not modified.
    F : int or float
        Exclusive upper bound on the height of a single mask, in rows.
    num_masks : int
        Number of masks to apply.
    replace_with_zero : bool
        Fill masked rows with 0 when true; otherwise fill them with the mean
        of the array as it stands when the mask is applied.

    Returns
    -------
    numpy.ndarray
        A masked copy of ``spec``.
    """
    cloned = spec.copy()
    n_bins = cloned.shape[0]
    for _ in range(num_masks):
        # Clamp the height to the number of rows: when the drawn height was
        # at least that large, randint() was given high <= 0 and raised
        # ValueError. A mask that tall now simply covers the whole axis.
        height = min(int(np.random.uniform(low=0.0, high=F)), n_bins)
        start = np.random.randint(low=0, high=max(n_bins - height, 1))
        fill = 0 if replace_with_zero else np.mean(cloned)
        cloned[start:start + height, :] = fill
    return cloned
# Mask the (power) mel spectrogram first in time, then in frequency
spec = S
spec_time_masked = time_mask(spec)
spec_freq_masked = freq_mask(spec_time_masked)
# Convert the masked spectrogram to decibels relative to its maximum
S_db_mel_aug = librosa.power_to_db(spec_freq_masked, ref=np.max)
# The rows are mel bands, so the frequency axis is labelled on the mel scale;
# y_axis="log" would treat them as linearly spaced FFT bins.
plot_spectrogram(S_db_mel_aug, sr, HOP_SIZE, y_axis="mel")